library(RColorBrewer)
library(mgcv)
library(lme4)
library(mclust)
library(rstan)
library(gtools)
library(dplyr)
library(reshape2)
options(mc.cores = parallel::detectCores())
RUN_STAN=F
source('functions.R')
thresh_highSM = 0.8
thresh_lowSM = 0.2
dat_all=read.csv( file = 'dataset.csv')
writeLines(sprintf('The dataset contains %s patients with measured PfHRP2 and measured platelet counts from %s studies',
nrow(dat_all),
length(unique(dat_all$study))))
## The dataset contains 2649 patients with measured PfHRP2 and measured platelet counts from 4 studies
African_studies = c('Kilifi (Kenya)',
'Kampala (Uganda)',
'FEAST (Uganda)')
writeLines('Patients per study:')
## Patients per study:
print(table(dat_all$study))
##
## Bangladesh FEAST (Uganda) Kampala (Uganda) Kilifi (Kenya)
## 172 567 492 1418
dat_all$study_col = brewer.pal(n = 8, name = 'Dark2')[c(3,4,5,7)][as.numeric(as.factor(dat_all$study))]
mycols = adjustcolor(brewer.pal(n = 10, 'Paired')[c(2,4,6,10)],
alpha.f = .7)
Some data cleaning
hist(dat_all$hct, breaks = 50)
# get rid of outlier HCT values
ind = which(dat_all$hct <3 | dat_all$hct>50)
dat_all$hct[ind] = NA
table(!is.na(dat_all$hct))
##
## FALSE TRUE
## 6 2643
## PfHRP2 deletions - samples with positive parasite count by negative on the elisa
ind = which(dat_all$hrp2==0 & dat_all$para > 1000)
table(dat_all$study[ind])
##
## Bangladesh FEAST (Uganda) Kilifi (Kenya)
## 1 8 18
writeLines(sprintf('A total of %s samples have zero PfHRP2 but more than 1000 parasites per uL',length(ind)))
## A total of 27 samples have zero PfHRP2 but more than 1000 parasites per uL
dat_all = dat_all[-ind, ]
writeLines(sprintf('After excluding the HRP2 outliers, the dataset contains %s patients with measured PfHRP2 and measured platelet counts',
nrow(dat_all)))
## After excluding the HRP2 outliers, the dataset contains 2622 patients with measured PfHRP2 and measured platelet counts
Results for Table 1
table(dat_all$study)
##
## Bangladesh FEAST (Uganda) Kampala (Uganda) Kilifi (Kenya)
## 171 559 492 1400
xx=aggregate(age ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
colnames(xx$age)=c('lower','median','upper')
xx$age = round(xx$age,1)
print(xx)
## study age.lower age.median age.upper
## 1 Bangladesh 23.5 30.0 45.0
## 2 FEAST (Uganda) 1.2 2.0 3.3
## 3 Kampala (Uganda) 2.2 3.3 4.6
## 4 Kilifi (Kenya) 1.4 2.4 3.7
aggregate(hrp2 ~ study, dat_all, length)
## study hrp2
## 1 Bangladesh 171
## 2 FEAST (Uganda) 559
## 3 Kampala (Uganda) 492
## 4 Kilifi (Kenya) 1400
aggregate(outcome ~ study, dat_all, function(x) round(100*mean(x),1))
## study outcome
## 1 Bangladesh 26.9
## 2 FEAST (Uganda) 11.4
## 3 Kampala (Uganda) 6.7
## 4 Kilifi (Kenya) 11.1
aggregate(platelet ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
## study platelet.25% platelet.50% platelet.75%
## 1 Bangladesh 27.0 50.0 139.0
## 2 FEAST (Uganda) 74.5 165.0 326.0
## 3 Kampala (Uganda) 49.0 96.0 169.5
## 4 Kilifi (Kenya) 64.0 111.0 215.0
aggregate(hrp2 ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
## study hrp2.25% hrp2.50% hrp2.75%
## 1 Bangladesh 1082.9050 2667.0400 6127.5550
## 2 FEAST (Uganda) 0.0000 174.7100 1952.6900
## 3 Kampala (Uganda) 588.0000 1838.4000 4097.4000
## 4 Kilifi (Kenya) 418.7393 2206.7408 5071.5299
aggregate(wbc ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
## study wbc.25% wbc.50% wbc.75%
## 1 Bangladesh 6.900 9.000 11.000
## 2 FEAST (Uganda) 8.400 11.950 18.675
## 3 Kampala (Uganda) 7.500 10.400 15.300
## 4 Kilifi (Kenya) 8.900 12.550 19.000
ind_pos = which(dat_all$malaria_positive==1)
dat_all$log10_parasites = log10(dat_all$para+50)
xx=aggregate(para ~ study, dat_all[ind_pos, ],
quantile, probs=c(0.25,.5,.75))
xx$para = round(xx$para)
print(xx)
## study para.25% para.50% para.75%
## 1 Bangladesh 23550 148874 348540
## 2 FEAST (Uganda) 3640 37600 153680
## 3 Kampala (Uganda) 10635 42530 198540
## 4 Kilifi (Kenya) 6099 69824 316350
table(dat_all$study, dat_all$sickle)
##
## 1 2 3
## Bangladesh 0 0 0
## FEAST (Uganda) 466 46 21
## Kampala (Uganda) 463 4 23
## Kilifi (Kenya) 1348 41 7
# xx = aggregate(log10_parasites ~ study, dat_all[ind_pos, ], mean)
# xx[,2] = round(10^xx[,2])
# print(xx)
Correlation between the two biomarkers
writeLines('African sites: correlation:')
## African sites: correlation:
ind_Africa = dat_all$study %in% African_studies
res=cor.test(log10(dat_all$platelet[ind_Africa]),
log10(dat_all$hrp2+1)[ind_Africa])
res
##
## Pearson's product-moment correlation
##
## data: log10(dat_all$platelet[ind_Africa]) and log10(dat_all$hrp2 + 1)[ind_Africa]
## t = -31.892, df = 2449, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5690929 -0.5131183
## sample estimates:
## cor
## -0.5417058
log10(res$p.value)
## [1] -186.2477
round(res$estimate,2)
## cor
## -0.54
writeLines('Bangladesh: correlation:')
## Bangladesh: correlation:
res=cor.test(log10(dat_all$platelet[!ind_Africa]),
log10(dat_all$hrp2+1)[!ind_Africa])
res
##
## Pearson's product-moment correlation
##
## data: log10(dat_all$platelet[!ind_Africa]) and log10(dat_all$hrp2 + 1)[!ind_Africa]
## t = -4.9404, df = 169, p-value = 1.863e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4797402 -0.2167256
## sample estimates:
## cor
## -0.3552439
log10(res$p.value)
## [1] -5.72988
round(res$estimate,2)
## cor
## -0.36
Summary plot of the biomarker data
mycols = rep('black',4)
dat_all$log10_platelet=log10(dat_all$platelet)
ind = which(dat_all$hrp2==0)
dat_all$hrp2[ind] = 10^rnorm(length(ind),0,.1)
dat_all$log10_hrp2 = log10(dat_all$hrp2)
par(las=1, family='serif', mfrow=c(2,2),cex.lab=1.5, cex.axis=1.5)
for(cc in unique(dat_all$study)){
plot(dat_all$log10_platelet,dat_all$log10_hrp2,
panel.first = grid(), xaxt='n', yaxt='n',
xlab='Platelet count (x1000 per uL)',type='n',
ylab='PfHRP2 (ng/mL)')
ind = dat_all$study==cc
title(paste(cc, ' (n=',sum(ind),')',sep = ''))
points(dat_all$log10_platelet[ind],
dat_all$log10_hrp2[ind], pch = 16,
col= adjustcolor(dat_all$study_col,.1)[ind])
points(dat_all$log10_platelet[ind],
dat_all$log10_hrp2[ind], pch = 1,
col= adjustcolor(dat_all$study_col,.5)[ind])
axis(1, at = log10(c(10, 20,100,1000)),
labels = c(10, 20,100,1000))
axis(1, at = log10(seq(20, 90, by =10)), labels = NA)
axis(1, at = log10(seq(200, 900, by =100)), labels = NA)
axis(2, at = 0:5,
labels = c(1, 10, expression(10^2),expression(10^3),
expression(10^4),expression(10^5)))
}
Analysis not including the FEAST study - only severe malaria studies
# make numeric matrix for stan input
ind_SMstudies = dat_all$study != 'FEAST (Uganda)' &
!is.na(dat_all$log10_parasites)
X = dat_all[ind_SMstudies,
c('log10_platelet',
'log10_hrp2',
'log10_parasites')]
site_index_SMstudies = as.numeric(as.factor(dat_all$study[ind_SMstudies]))
table(site_index_SMstudies)
## site_index_SMstudies
## 1 2 3
## 170 484 1398
stan_dat_all =
list(N = nrow(X),
y = X,
Ntest = ncol(X),
K_sites = max(site_index_SMstudies),
site = site_index_SMstudies,
prior_mu = matrix(c(log10(75), log10(200),
log10(250),log10(3000),
3,5),
nrow = ncol(X), byrow = T),
prior_sigma_mu = matrix(c(0.1, 0.1,
0.2, 0.2,
0.25, 0.25),
nrow = ncol(X), byrow = T),
beta_prior_prev = matrix(data = c(19,1,
14,7,
14,7),
nrow = max(site_index_SMstudies),
byrow = T),
cluster = matrix(data = c(1,2,
2,1,
2,1),
nrow = ncol(X), byrow = T))
print(stan_dat_all$N)
## [1] 2052
# just platelets and HRP2 (reference)
ind_SMstudies2 = dat_all$study != 'FEAST (Uganda)'
X = dat_all[ind_SMstudies2,c('log10_platelet','log10_hrp2')]
stan_dat_plt_hrp2 = stan_dat_all
stan_dat_plt_hrp2$y = X
stan_dat_plt_hrp2$N = nrow(X)
stan_dat_plt_hrp2$site = as.numeric(as.factor(dat_all$study[ind_SMstudies2]))
stan_dat_plt_hrp2$Ntest = 2
stan_dat_plt_hrp2$prior_mu = stan_dat_all$prior_mu[1:2,]
stan_dat_plt_hrp2$prior_sigma_mu = stan_dat_all$prior_sigma_mu[1:2,]
stan_dat_plt_hrp2$cluster = stan_dat_all$cluster[1:2,]
# compile stan model
lc_mod = stan_model(file = 'Latent_class_continuous_2.stan')
fname_all = paste0('Rout/', 'mod2_LC_all.stanfit')
if(RUN_STAN | !file.exists(fname_all)){
out_all = sampling(lc_mod, data=stan_dat_all, thin=4)
save(out_all, file = fname_all)
} else {
load(fname_all)
}
fname_plt_hrp2 = paste0('Rout/', 'mod2_LC_plt_hrp2.stanfit')
if(RUN_STAN | !file.exists(fname_plt_hrp2)){
out_plt_hrp2 = sampling(lc_mod, data=stan_dat_plt_hrp2, thin=4)
save(out_plt_hrp2, file = fname_plt_hrp2)
} else {
load(fname_plt_hrp2)
}
traceplot(out_all,par=c('prev','mu','sigma'))
traceplot(out_plt_hrp2,par=c('prev','mu','sigma'))
thetas_all = extract(out_all)
thetas_plt_hrp2 = extract(out_plt_hrp2)
round(100*colMeans(thetas_all$prev))
## [1] 98 73 67
round(100*colMeans(thetas_plt_hrp2$prev))
## [1] 94 70 65
mean_vals = round(10^apply(thetas_all$mu, 2:3, mean))
colnames(mean_vals) = c('SM', 'not SM')
rownames(mean_vals) = c('Platelet count','PfHRP2','Parasite count')
print(mean_vals)
##
## SM not SM
## Platelet count 73 235
## PfHRP2 182 3247
## Parasite count 7584 78667
mean_vals2 = round(10^apply(thetas_plt_hrp2$mu, 2:3, mean))
colnames(mean_vals2) = c('SM', 'not SM')
rownames(mean_vals2) = c('Platelet count','PfHRP2')
print(mean_vals2)
##
## SM not SM
## Platelet count 70 231
## PfHRP2 196 3415
hist(colMeans(thetas_all$ps_SM),breaks = 20)
ind_SM = colMeans(thetas_all$ps_SM)>thresh_highSM
ind_notSM = colMeans(thetas_all$ps_SM)<thresh_lowSM
sum(ind_SM)
## [1] 1344
sum(ind_notSM)
## [1] 463
cor.test(stan_dat_all$y[ind_SM, 1],stan_dat_all$y[ind_SM,2])
##
## Pearson's product-moment correlation
##
## data: stan_dat_all$y[ind_SM, 1] and stan_dat_all$y[ind_SM, 2]
## t = -2.7007, df = 1342, p-value = 0.007007
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1264960 -0.0201302
## sample estimates:
## cor
## -0.07352218
cor.test(stan_dat_all$y[ind_SM, 2],stan_dat_all$y[ind_SM,3])
##
## Pearson's product-moment correlation
##
## data: stan_dat_all$y[ind_SM, 2] and stan_dat_all$y[ind_SM, 3]
## t = -4.3847, df = 1342, p-value = 1.252e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.17122533 -0.06578923
## sample estimates:
## cor
## -0.1188423
cor.test(stan_dat_all$y[ind_notSM, 1],stan_dat_all$y[ind_notSM,2])
##
## Pearson's product-moment correlation
##
## data: stan_dat_all$y[ind_notSM, 1] and stan_dat_all$y[ind_notSM, 2]
## t = 2.1711, df = 461, p-value = 0.03043
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.009562577 0.189993526
## sample estimates:
## cor
## 0.1006052
cor.test(stan_dat_all$y[ind_notSM, 2],stan_dat_all$y[ind_notSM,3])
##
## Pearson's product-moment correlation
##
## data: stan_dat_all$y[ind_notSM, 2] and stan_dat_all$y[ind_notSM, 3]
## t = 3.1228, df = 461, p-value = 0.001904
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.05349835 0.23201405
## sample estimates:
## cor
## 0.1439269
round(10^apply(thetas_plt_hrp2$mu, 2:3, mean))
##
## [,1] [,2]
## [1,] 70 231
## [2,] 196 3415
round(10^(apply(thetas_plt_hrp2$mu, 2:3, mean)+ 1.96*apply(thetas_plt_hrp2$sigma, 2:3, mean)))
##
## [,1] [,2]
## [1,] 280 822
## [2,] 5476 25279
round(10^(apply(thetas_plt_hrp2$mu, 2:3, mean)- 1.96*apply(thetas_plt_hrp2$sigma, 2:3, mean)))
##
## [,1] [,2]
## [1,] 18 65
## [2,] 7 461
par(mfrow=c(2,2), las=1)
plot(stan_dat_all$y[ind_SM, 1],stan_dat_all$y[ind_SM,2],
xlab='Platelet count',ylab='HRP2',main='True SM',
xlim=range(stan_dat_all$y[,1]),ylim=range(stan_dat_all$y[,2]),
col=dat_all$study_col[ind_SM],panel.first=grid())
legend('bottomright',col = unique(dat_all$study_col),
legend = unique(dat_all$study),pch=1,inset=0.03)
plot(stan_dat_all$y[ind_SM, 2],stan_dat_all$y[ind_SM,3],
xlab='HRP2', xlim=range(stan_dat_all$y[,2]),
ylim=range(stan_dat_all$y[,3]),
ylab='Parasite density',main='True SM',
col=dat_all$study_col[ind_SM],panel.first=grid())
plot(stan_dat_all$y[ind_notSM, 1],stan_dat_all$y[ind_notSM,2],
xlab='Platelet count',ylab='HRP2',main='Not SM',
xlim=range(stan_dat_all$y[,1]),ylim=range(stan_dat_all$y[,2]),
col=dat_all$study_col[ind_notSM],panel.first=grid())
plot(stan_dat_all$y[ind_notSM, 2],stan_dat_all$y[ind_notSM,3],
xlim=range(stan_dat_all$y[,2]),ylim=range(stan_dat_all$y[,3]),
xlab='HRP2',ylab='Parasite density',main='Not SM',
col=dat_all$study_col[ind_notSM],panel.first=grid())
Just use the two severe malaria studies from Africa
# make numeric matrix for stan input
ind_SMAfrica = dat_all$study != 'FEAST (Uganda)' &
dat_all$study != 'Bangladesh' &
!is.na(dat_all$log10_parasites)
X = dat_all[ind_SMAfrica,
c('log10_platelet','log10_hrp2',
'log10_parasites')]
site_index_SMAfrica = as.numeric(as.factor(dat_all$study[ind_SMAfrica]))
table(site_index_SMAfrica)
## site_index_SMAfrica
## 1 2
## 484 1398
stan_dat_all_Af =
list(N = nrow(X),
y = X,
Ntest = ncol(X),
K_sites = max(site_index_SMAfrica),
site = site_index_SMAfrica,
prior_mu = matrix(c(log10(75), log10(200),
log10(250),log10(3000),
3,5),
nrow = ncol(X), byrow = T),
prior_sigma_mu = matrix(c(0.1, 0.1,
0.1, 0.1,
0.2, 0.2),
nrow = ncol(X), byrow = T),
beta_prior_prev = matrix(data = c(14,7,
14,7),
nrow = max(site_index_SMAfrica),
byrow = T),
cluster = matrix(data = c(1,2,
2,1,
2,1),
nrow = ncol(X), byrow = T))
# just platelets and HRP2 (reference)
stan_dat_plt_hrp2_Af = stan_dat_all_Af
stan_dat_plt_hrp2_Af$y = X[,1:2]
stan_dat_plt_hrp2_Af$Ntest = 2
stan_dat_plt_hrp2_Af$prior_mu = stan_dat_all_Af$prior_mu[1:2,]
stan_dat_plt_hrp2_Af$prior_sigma_mu =stan_dat_all_Af$prior_sigma_mu[1:2,]
stan_dat_plt_hrp2_Af$cluster = stan_dat_all_Af$cluster[1:2,]
fname_all_Af = paste0('Rout/', 'mod2_LC_all_Af.stanfit')
if(RUN_STAN | !file.exists(fname_all_Af)){
out_all_Af = sampling(lc_mod, data=stan_dat_all_Af, thin=4)
save(out_all_Af, file = fname_all_Af)
} else {
load(fname_all_Af)
}
fname_plt_hrp2 = paste0('Rout/', 'mod2_LC_plt_hrp2_Af.stanfit')
if(RUN_STAN | !file.exists(fname_plt_hrp2)){
out_plt_hrp2_Af = sampling(lc_mod, data=stan_dat_plt_hrp2_Af, thin=4)
save(out_plt_hrp2_Af, file = fname_plt_hrp2)
} else {
load(fname_plt_hrp2)
}
traceplot(out_all_Af)
## 'pars' not specified. Showing first 10 parameters by default.
traceplot(out_plt_hrp2_Af)
## 'pars' not specified. Showing first 10 parameters by default.
thetas_all_Af = extract(out_all_Af)
thetas_plt_hrp2_Af = extract(out_plt_hrp2_Af)
round(100*colMeans(thetas_all_Af$prev))
## [1] 71 66
round(100*colMeans(thetas_plt_hrp2_Af$prev))
## [1] 69 64
mean_vals = round(10^apply(thetas_all_Af$mu, 2:3, mean))
colnames(mean_vals) = c('SM', 'not SM')
rownames(mean_vals) = c('Platelet count','PfHRP2','Parasite count')
print(mean_vals)
##
## SM not SM
## Platelet count 75 231
## PfHRP2 193 3398
## Parasite count 7617 80408
mean_vals2 = round(10^apply(thetas_plt_hrp2_Af$mu, 2:3, mean))
colnames(mean_vals2) = c('SM', 'not SM')
rownames(mean_vals2) = c('Platelet count','PfHRP2')
print(mean_vals2)
##
## SM not SM
## Platelet count 73 228
## PfHRP2 208 3556
par(mfcol=c(2,2),las=1,cex.lab=1.3,cex.axis=1.3,family='serif')
roc_analysis(thetas = thetas_all, stan_dat = stan_dat_all,
xvals = matrix(c(log10(10),log10(1000),
1,5,1,6.5),
nrow = 3,byrow = T),
xnames = c('Platelet count (x1000 per uL)',
'PfHRP2 (ng/mL)',
'Parasite count (per uL)'),
cutofftype = c('upper','lower','lower'),
mycuts = log10(c(150, 1000, 10000)))
## [1] "Platelet count (x1000 per uL)"
## [1] 84
## [1] 75
## [1] "PfHRP2 (ng/mL)"
## [1] 87
## [1] 84
## [1] "Parasite count (per uL)"
## [1] 84
## [1] 55
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
# sensitivity analysis using African studies only
writeLines('Model using only African studies')
## Model using only African studies
roc_analysis(thetas = thetas_all_Af,
stan_dat = stan_dat_all_Af,
xvals = matrix(c(log10(10),log10(300),
1,5,1,6.3),
nrow = 3,byrow = T),
xnames = c('Platelet count (x1000 per uL)',
'PfHRP2 (ng/mL)',
'Parasite count (per uL)'),
cutofftype = c('upper','lower','lower'),
mycuts = log10(c(150, 1000, 10000)))
## [1] "Platelet count (x1000 per uL)"
## [1] 85
## [1] 74
## [1] "PfHRP2 (ng/mL)"
## [1] 89
## [1] 83
## [1] "Parasite count (per uL)"
## [1] 85
## [1] 55
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
writeLines('Model using only platelets and HRP2')
## Model using only platelets and HRP2
par(mfcol=c(2,2),las=1,cex.lab=1.3,cex.axis=1.3,family='serif')
roc_analysis(thetas = thetas_plt_hrp2, stan_dat = stan_dat_plt_hrp2,
xvals = matrix(c(log10(10),log10(1000),1,5),
nrow = 2,byrow = T),
xnames = c('Platelet count (x1000 per uL)',
'PfHRP2 (ng/mL)'),
cutofftype = c('upper','lower'))
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
lc3_mod = stan_model(file = 'Latent_class_continuous_3.stan')
This does not fit when we add parasite counts. So we are only using two biomarkers
# make numeric matrix for stan input
X = dat_all[, c('log10_platelet','log10_hrp2')]
site_index = as.numeric(as.factor(dat_all$study))
table(site_index, dat_all$study)
##
## site_index Bangladesh FEAST (Uganda) Kampala (Uganda) Kilifi (Kenya)
## 1 171 0 0 0
## 2 0 559 0 0
## 3 0 0 492 0
## 4 0 0 0 1400
stan_dat =
list(N = nrow(X),
y = X,
Ntest = ncol(X),
K_sites = max(site_index),
site = site_index,
prior_mu = matrix(c(log10(75), log10(200),log10(300),
log10(2),log10(250),log10(3000)),
nrow = ncol(X),ncol = 3, byrow = T),
prior_sigma_mu = matrix(c(0.1, 0.1, 0.1,
0.1, 0.1, 0.1),
nrow = ncol(X), ncol = 3, byrow = T),
alpha_prior = matrix(data = c(19, 1, .1,
3, 3, 3,
14, 7, 1,
14, 7, 1),
nrow = max(site_index),
byrow = T),
cluster = matrix(data = c(1,2,3,
3,2,1),
nrow = ncol(X), ncol = 3, byrow = T))
fname_final = paste0('Rout/', 'mod3_LC_final.stanfit')
if(RUN_STAN | !file.exists(fname_final)){
out_final = sampling(lc3_mod, data = stan_dat, iter=2000,thin=4)
save(out_final, file = fname_final)
} else {
load(fname_final)
}
traceplot(out_final, par=c('theta'))
thetas_final = extract(out_final)
writeLines('Estimated prevalences across the 4 studies:\n')
## Estimated prevalences across the 4 studies:
round(100*apply(thetas_final$theta[, ,1], 2,
quantile, probs=c(.025, 0.5, 0.975)),1)
## [,1] [,2] [,3] [,4]
## 2.5% 87.6 30.4 64.2 61.1
## 50% 93.8 35.1 69.7 64.7
## 97.5% 97.9 39.9 74.6 68.1
c_mean=round(10^apply(thetas_final$mu, 2:3, mean))
c_upper=round(10^(apply(thetas_final$mu, 2:3, mean)+ 1.96*apply(thetas_final$sigma, 2:3, mean)))
c_lower=round(10^(apply(thetas_final$mu, 2:3, mean)- 1.96*apply(thetas_final$sigma, 2:3, mean)))
writeLines(sprintf('In SM the 95%% pred interval for the platelet count is %s to %s with a mean of %s',c_lower[1,1],c_upper[1,1],c_mean[1,1] ))
## In SM the 95% pred interval for the platelet count is 21 to 235 with a mean of 70
writeLines(sprintf('In SM the 95%% pred interval for the PfHRP2 concentration is %s to %s with a mean of %s',c_lower[2,3],c_upper[2,3],c_mean[2,3] ))
## In SM the 95% pred interval for the PfHRP2 concentration is 570 to 19718 with a mean of 3352
ind_not_FEAST = stan_dat$site != 2
plot(colMeans(thetas_final$ps_SM)[ind_not_FEAST],
colMeans(thetas_plt_hrp2$ps_SM))
lines(0:1, 0:1)
Check conditional independence
ps = apply(thetas_final$ps_comp, 2:3, mean)
ind_SM = ps[,1]>0.5; sum(ind_SM)
## [1] 1635
ind_1 = ps[,2]>0.5; sum(ind_1)
## [1] 793
ind_2 = ps[,3]>0.5; sum(ind_2)
## [1] 194
par(mfrow=c(2,2), las=1)
plot(stan_dat$y[ind_SM, 1],stan_dat$y[ind_SM,2],
xlab='Platelet count (x1000 per uL)',
ylab='PfHRP2 (ng/mL)',main='Severe malaria',
xlim=range(stan_dat$y[,1]),ylim=range(stan_dat$y[,2]),
col = dat_all$study_col[ind_SM],panel.first=grid())
cor.test(stan_dat$y[ind_SM, 1],stan_dat$y[ind_SM,2])
##
## Pearson's product-moment correlation
##
## data: stan_dat$y[ind_SM, 1] and stan_dat$y[ind_SM, 2]
## t = -2.0615, df = 1633, p-value = 0.03941
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09918144 -0.00247591
## sample estimates:
## cor
## -0.0509481
legend('bottomright',col = unique(dat_all$study_col),
legend = unique(dat_all$study),pch=1,inset=0.03)
plot(stan_dat$y[ind_1, 1],stan_dat$y[ind_1,2],
xlab='Platelet count (x1000 per uL)',
ylab='PfHRP2 (ng/mL)',main='Not severe malaria (component 1)',
xlim=range(stan_dat$y[,1]),ylim=range(stan_dat$y[,2]),
col = dat_all$study_col[ind_1],panel.first=grid())
cor.test(stan_dat$y[ind_1, 1],stan_dat$y[ind_1,2])
##
## Pearson's product-moment correlation
##
## data: stan_dat$y[ind_1, 1] and stan_dat$y[ind_1, 2]
## t = 1.2502, df = 791, p-value = 0.2116
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02528914 0.11367676
## sample estimates:
## cor
## 0.04440863
plot(stan_dat$y[ind_2, 1],stan_dat$y[ind_2,2],
xlab='Platelet count (x1000 per uL)',
ylab='PfHRP2 (ng/mL)',main='Not severe malaria (component 2)',
xlim=range(stan_dat$y[,1]),ylim=range(stan_dat$y[,2]),
col = dat_all$study_col[ind_2],panel.first=grid())
cor.test(stan_dat$y[ind_2, 1],stan_dat$y[ind_2,2])
##
## Pearson's product-moment correlation
##
## data: stan_dat$y[ind_2, 1] and stan_dat$y[ind_2, 2]
## t = -0.68593, df = 192, p-value = 0.4936
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.18900064 0.09207392
## sample estimates:
## cor
## -0.04944223
ROC curves
par(mfcol=c(2,2),las=1,cex.lab=1.3,cex.axis=1.3,family='serif')
roc_analysis(thetas = thetas_final, stan_dat = stan_dat,
xvals = matrix(c(log10(10),log10(1000),1,5),
nrow = 2,byrow = T),
xnames = c('Platelet count (x1000 per uL)',
'PfHRP2 (ng/mL)'),
cutofftype = c('upper','lower'))
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter
Make a P(SM) variable for subsequent analysis (continuous) and a discrete classification based on a cutoff of 0.5
dat_all$P_SM = colMeans(thetas_final$ps_SM)
dat_all$SM = as.numeric(dat_all$P_SM>0.5)
hist(dat_all$P_SM, breaks = 20,main='Final model', xlab='P(SM)')
Relationship with the two main subphenotypes, cerebral malaria and severe malaria anaemia
writeLines('False diagnosis rate by coma status and by study:')
## False diagnosis rate by coma status and by study:
print(aggregate(SM ~ coma + study, data = dat_all, mean))
## coma study SM
## 1 0 Bangladesh 0.9841270
## 2 1 Bangladesh 0.9629630
## 3 0 FEAST (Uganda) 0.3333333
## 4 1 FEAST (Uganda) 0.4311927
## 5 0 Kampala (Uganda) 0.5131579
## 6 1 Kampala (Uganda) 0.8636364
## 7 0 Kilifi (Kenya) 0.7113402
## 8 1 Kilifi (Kenya) 0.6212121
writeLines('True diagnosis rate by severe anaemia status (<15% haematocrit) and by study:')
## True diagnosis rate by severe anaemia status (<15% haematocrit) and by study:
dat_all$SMA = as.numeric(dat_all$hct<=15)
print(aggregate(SM ~ SMA + study, data = dat_all,mean))
## SMA study SM
## 1 0 Bangladesh 0.9691358
## 2 1 Bangladesh 1.0000000
## 3 0 FEAST (Uganda) 0.3357488
## 4 1 FEAST (Uganda) 0.4125874
## 5 0 Kampala (Uganda) 0.8516949
## 6 1 Kampala (Uganda) 0.5625000
## 7 0 Kilifi (Kenya) 0.6199461
## 8 1 Kilifi (Kenya) 0.8280702
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3,
cex.axis=1.3)
n_levels = 11
f=colorRampPalette(colors = RColorBrewer::brewer.pal(name = 'RdYlBu',n = n_levels))
mycuts = cut(x = dat_all$P_SM, breaks = seq(0,1,by=1/n_levels))
cols_ind = as.numeric(mycuts)
mycols = f(n_levels)
for(cc in unique(dat_all$study)){
ind = dat_all$study==cc
plot(X, type='n',
panel.first=grid(),
xlab='Platelet count (x1000 per uL)',
ylab='PfHRP2 (ng/mL)', xaxt='n', yaxt='n')
points(X[ind, ], pch=1,
col= adjustcolor(mycols[cols_ind],.7)[ind])
points(X[ind, ], pch=16,
col= adjustcolor(mycols[cols_ind],.4)[ind])
if(cc %in% African_studies){
ind_AS = dat_all$sickle==2
points(X[ind&ind_AS, ],pch=2)
ind_SS = dat_all$sickle==3
points(X[ind&ind_SS, ],pch=3)
}
title(cc)
axis(1, at = log10(c(10, 20,100,200,1000)),
labels = c(10, 20,100,200,1000))
axis(1, at = log10(seq(20, 100, by =10)),
labels = NA)
axis(1, at = log10(seq(200, 1000, by =100)),
labels = NA)
axis(2, at = 0:5,
labels = c(1, 10,
expression(10^2),
expression(10^3),
expression(10^4),
expression(10^5)))
}
table(sickle=dat_all$sickle,
class=dat_all$SM,
study=dat_all$study)
## , , study = Bangladesh
##
## class
## sickle 0 1
## 1 0 0
## 2 0 0
## 3 0 0
##
## , , study = FEAST (Uganda)
##
## class
## sickle 0 1
## 1 284 182
## 2 40 6
## 3 20 1
##
## , , study = Kampala (Uganda)
##
## class
## sickle 0 1
## 1 126 337
## 2 2 2
## 3 19 4
##
## , , study = Kilifi (Kenya)
##
## class
## sickle 0 1
## 1 437 911
## 2 27 14
## 3 6 1
lgd_ = rep(NA, n_levels)
lgd_[c(1,6,11)] = c(1,0.5,0)
legend('bottomleft',
legend = lgd_,
fill = rev(mycols),bty='n',x.intersp =.5,
border = NA,inset = 0.01,
y.intersp = 0.5,title = '',
cex = 1, text.font = 1)
legend('bottomright',pch=2:3, legend = c('HbAS','HbSS'),inset=0.03)
### Parasite counts
ind_neg = dat_all$malaria_positive
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
ind = dat_all$study==cc
set.seed(7475)
myxs=dat_all$P_SM[ind]
myys=log10(dat_all$para+10^1.7)[ind]
plot(myxs,myys, panel.first=grid(), ylim =c(1.5,6.5),
pch=1, xlim=c(0,1),
col = adjustcolor('black',.4),
xlab='Prob(Severe Malaria)',
ylab='Parasite density per uL',yaxt='n')
points(myxs,myys,pch=16,col = adjustcolor('black',.1))
# axis(1, at=1:3,labels = c('A','B','C'), tick = F)
axis(2, at = 2:6,
labels = c(expression(10^2),
expression(10^3),
expression(10^4),
expression(10^5),
expression(10^6)))
mod=gam(myys ~ s(myxs,k=3),
data=data.frame(myys=myys,myxs=myxs))
pxs=seq(0,1,length.out = 100)
lines(pxs,predict(mod, data.frame(myxs=pxs)),lwd=3)
title(cc)
}
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
ind = dat_all$study==cc
set.seed(7475)
myxs=(dat_all$P_SM)[ind]
myys=jitter(dat_all$hct[ind], amount = .1)
plot(myxs,myys,panel.first=grid(),
ylim = range(dat_all$hct,na.rm = T),
pch=1, xlim=c(0,1),
col = adjustcolor('black',.4),
xlab='Prob(Severe Malaria)',
ylab='Haematocrit (%)')
points(myxs,myys,pch=16,col = adjustcolor('black',.1))
mod=gam(myys ~ s(myxs,k=3),
data=data.frame(myys=myys,myxs=myxs))
pxs=seq(0,1,length.out = 100)
lines(pxs,predict(mod, data.frame(myxs=pxs)),lwd=3)
title(cc)
}
hct_subgroups = aggregate(hct ~ SM + study,
dat_all, median)
print(hct_subgroups)
## SM study hct
## 1 0 Bangladesh 35.0
## 2 1 Bangladesh 27.3
## 3 0 FEAST (Uganda) 30.0
## 4 1 FEAST (Uganda) 19.2
## 5 0 Kampala (Uganda) 12.2
## 6 1 Kampala (Uganda) 15.8
## 7 0 Kilifi (Kenya) 27.5
## 8 1 Kilifi (Kenya) 19.4
# Severe anaemia
SA_subgroups = aggregate(hct ~ SM + study,
dat_all, function(x) round(100*mean(x<15),1))
print(SA_subgroups)
## SM study hct
## 1 0 Bangladesh 0.0
## 2 1 Bangladesh 4.2
## 3 0 FEAST (Uganda) 21.7
## 4 1 FEAST (Uganda) 27.8
## 5 0 Kampala (Uganda) 76.2
## 6 1 Kampala (Uganda) 41.7
## 7 0 Kilifi (Kenya) 10.2
## 8 1 Kilifi (Kenya) 24.6
ind_SM = dat_all$P_SM > thresh_highSM
ind_notSM = dat_all$P_SM < thresh_lowSM
par(mfcol=c(2,3))
for(cc in c("FEAST (Uganda)","Kampala (Uganda)", "Kilifi (Kenya)")){
ind = dat_all$study==cc
hist(dat_all$hct[ind_SM&ind], breaks = seq(0,50, by=1),
xlab='Haematocrit (%)', ylab='Number of patients',
main = paste0(cc,': ', 'P(SM)>0.8'),
panel.first=grid())
abline(v=median(dat_all$hct[ind_SM&ind],na.rm=T),lwd=2, col='red')
print(median(dat_all$hct[ind_SM&ind],na.rm=T))
hist(dat_all$hct[ind_notSM&ind], breaks = seq(0,50, by=1),
xlab='Haematocrit (%)', ylab='Number of patients',
main = paste0(cc,': ', 'P(SM)<0.2'),
panel.first=grid())
abline(v=median(dat_all$hct[ind_notSM&ind],na.rm=T),lwd=2, col='red')
print(median(dat_all$hct[ind_notSM&ind],na.rm=T))
}
## [1] 19.2
## [1] 31
## [1] 17
## [1] 11.6
## [1] 19.4
## [1] 28.35
P(Severe malaria) and mortality
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
ind = dat_all$study==cc
set.seed(7475)
xs = seq(0,1,length.out = 100)
mod=glm(outcome ~ P_SM, family='binomial', data = dat_all[ind, ])
print(summary(mod))
preds=predict(mod,newdata = data.frame(P_SM=xs),se.fit = T)
plot(xs,100*inv.logit(preds$fit),panel.first=grid(),
type='l',lwd=2, col = adjustcolor('black',.4),
xlab='Prob(Severe Malaria)',
ylim = range(100*inv.logit(c(preds$fit+1.96*preds$se.fit,
rev(preds$fit-1.96*preds$se.fit)))),
ylab='Probability of death (%)')
polygon(x = c(xs, rev(xs)),
y = 100*inv.logit(c(preds$fit+1.96*preds$se.fit,
rev(preds$fit-1.96*preds$se.fit))),
col=adjustcolor('grey', .3), border = NA)
title(cc)
}
##
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5023 -0.5023 -0.4983 -0.4766 2.1132
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0067 0.1693 -11.854 <2e-16 ***
## P_SM -0.1129 0.3111 -0.363 0.717
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 397.79 on 558 degrees of freedom
## Residual deviance: 397.65 on 557 degrees of freedom
## AIC: 401.65
##
## Number of Fisher Scoring iterations: 4
##
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4602 -0.4568 -0.4184 -0.1625 3.0445
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.6411 0.7967 -5.826 5.69e-09 ***
## P_SM 2.4517 0.8583 2.856 0.00428 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 242.07 on 491 degrees of freedom
## Residual deviance: 228.40 on 490 degrees of freedom
## AIC: 232.4
##
## Number of Fisher Scoring iterations: 7
##
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5560 -0.5223 -0.4482 -0.4442 2.1756
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7887 0.1435 -12.465 <2e-16 ***
## P_SM -0.4800 0.1985 -2.418 0.0156 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 974.42 on 1399 degrees of freedom
## Residual deviance: 968.68 on 1398 degrees of freedom
## AIC: 972.68
##
## Number of Fisher Scoring iterations: 4
##
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8060 -0.8059 -0.8013 1.6016 1.7483
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6335 1.1108 -1.471 0.141
## P_SM 0.6758 1.1645 0.580 0.562
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 199.14 on 170 degrees of freedom
## Residual deviance: 198.77 on 169 degrees of freedom
## AIC: 202.77
##
## Number of Fisher Scoring iterations: 4
Summary of lab values by inferred subgroup
age_subgroups = aggregate(age ~ SM + study,
dat_all, function(x) round(median(x),1))
print(age_subgroups)
## SM study age
## 1 0 Bangladesh 26.0
## 2 1 Bangladesh 30.0
## 3 0 FEAST (Uganda) 1.8
## 4 1 FEAST (Uganda) 2.4
## 5 0 Kampala (Uganda) 3.3
## 6 1 Kampala (Uganda) 3.3
## 7 0 Kilifi (Kenya) 2.3
## 8 1 Kilifi (Kenya) 2.4
para_subgroups = aggregate(log10_parasites ~ SM + study,
dat_all, function(x) round(mean(x),1))
para_subgroups$log10_parasites=round(10^para_subgroups$log10_parasites)
print(para_subgroups)
## SM study log10_parasites
## 1 0 Bangladesh 100000
## 2 1 Bangladesh 79433
## 3 0 FEAST (Uganda) 316
## 4 1 FEAST (Uganda) 39811
## 5 0 Kampala (Uganda) 12589
## 6 1 Kampala (Uganda) 50119
## 7 0 Kilifi (Kenya) 12589
## 8 1 Kilifi (Kenya) 79433
dat_all$log10_wbc = log10(dat_all$wbc)
wbc_subgroups = aggregate(log10_wbc ~ SM + study,
dat_all, function(x) round(mean(x),1))
wbc_subgroups$log10_wbc=round(10^wbc_subgroups$log10_wbc)
print(wbc_subgroups)
## SM study log10_wbc
## 1 0 Bangladesh 8
## 2 1 Bangladesh 8
## 3 0 FEAST (Uganda) 13
## 4 1 FEAST (Uganda) 10
## 5 0 Kampala (Uganda) 13
## 6 1 Kampala (Uganda) 10
## 7 0 Kilifi (Kenya) 16
## 8 1 Kilifi (Kenya) 13
hbb_subgroups = aggregate(sickle ~ SM + study,
dat_all, table)
print(hbb_subgroups)
## SM study sickle.1 sickle.2 sickle.3
## 1 0 FEAST (Uganda) 284 40 20
## 2 1 FEAST (Uganda) 182 6 1
## 3 0 Kampala (Uganda) 126 2 19
## 4 1 Kampala (Uganda) 337 2 4
## 5 0 Kilifi (Kenya) 437 27 6
## 6 1 Kilifi (Kenya) 911 14 1
death_subgroups = aggregate(outcome ~ SM + study,
dat_all, function(x) round(100*mean(x),1))
print(death_subgroups)
## SM study outcome
## 1 0 Bangladesh 0.0
## 2 1 Bangladesh 27.7
## 3 0 FEAST (Uganda) 11.6
## 4 1 FEAST (Uganda) 11.1
## 5 0 Kampala (Uganda) 1.4
## 6 1 Kampala (Uganda) 9.0
## 7 0 Kilifi (Kenya) 13.9
## 8 1 Kilifi (Kenya) 9.6
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
ind = dat_all$study==cc
set.seed(7475)
myxs=(dat_all$P_SM)[ind]
myys=log10(dat_all$wbc[ind])
plot(myxs,myys,panel.first=grid(),
ylim = range(log10(dat_all$wbc),na.rm = T),
pch=1, xlim=c(0,1), yaxt='n',
col = adjustcolor('black',.4),
xlab='Prob(Severe Malaria)',
ylab='White blood cell count (x1000 per uL)')
axis(2, at = log10(c(1,3,10,30,100)), labels = c(1,3,10,30,100))
points(myxs,myys,pch=16,col = adjustcolor('black',.1))
mod=gam(myys ~ s(myxs,k=3),
data=data.frame(myys=myys,myxs=myxs))
pxs=seq(0,1,length.out = 100)
lines(pxs,predict(mod, data.frame(myxs=pxs)),lwd=3)
title(cc)
}
mod_wbc = lm(log10_wbc ~ study + SM + age, data = dat_all)
xx=summary(mod_wbc)
xx$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.287494636 0.045524542 28.281331 9.516183e-154
## studyFEAST (Uganda) -0.124753261 0.043503756 -2.867643 4.168538e-03
## studyKampala (Uganda) -0.142630933 0.042142174 -3.384518 7.235529e-04
## studyKilifi (Kenya) -0.069474443 0.042020916 -1.653330 9.838379e-02
## SM -0.121163979 0.010854018 -11.163053 2.669766e-28
## age -0.006631417 0.001184684 -5.597624 2.399319e-08
10^(-xx$coefficients['SM',1])
## [1] 1.321795
mod_wbc2 = gam(as.numeric(wbc>15) ~ study + SM + s(age,k=3),
family='binomial',
data = dat_all)
xx=summary(mod_wbc2)
xx$p.pv
## (Intercept) studyFEAST (Uganda) studyKampala (Uganda)
## 3.872369e-05 4.663045e-07 1.014519e-07
## studyKilifi (Kenya) SM
## 9.244550e-06 2.111265e-17
round(exp(xx$coefficients['SM',1] + c(-1,0,1)*xx$coefficients['SM',2]),2)
## numeric(0)
table(dat_all$study, dat_all$wbc > 15)
##
## FALSE TRUE
## Bangladesh 149 22
## FEAST (Uganda) 354 204
## Kampala (Uganda) 365 126
## Kilifi (Kenya) 870 530
Relationship between the blood culture data and the probability of severe malaria. We also look at whether WBC values predict bacteraemia
mod=glm(BS ~ SM, family='binomial',data = dat_all)
table(dat_all$SM, dat_all$BS)
##
## 0 1
## 0 445 29
## 1 904 22
summary(mod)
##
## Call:
## glm(formula = BS ~ SM, family = "binomial", data = dat_all)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3553 -0.3553 -0.2193 -0.2193 2.7349
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7308 0.1917 -14.249 < 2e-16 ***
## SM -0.9850 0.2886 -3.413 0.000642 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 437.98 on 1399 degrees of freedom
## Residual deviance: 426.26 on 1398 degrees of freedom
## (1222 observations deleted due to missingness)
## AIC: 430.26
##
## Number of Fisher Scoring iterations: 6
xx=summary(mod)$coefficients
exp(xx[2,1] + c(-1,0,1)*1.96* xx[2,2])
## [1] 0.2121076 0.3734361 0.6574707
summary(glm(BS ~ log10_wbc, family='binomial', data = dat_all))
##
## Call:
## glm(formula = BS ~ log10_wbc, family = "binomial", data = dat_all)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2857 -0.2745 -0.2719 -0.2697 2.6151
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.3902 0.6290 -5.390 7.06e-08 ***
## log10_wbc 0.1024 0.5442 0.188 0.851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 437.98 on 1399 degrees of freedom
## Residual deviance: 437.95 on 1398 degrees of freedom
## (1222 observations deleted due to missingness)
## AIC: 441.95
##
## Number of Fisher Scoring iterations: 6
summary(glm(BS ~ log10_wbc, family='binomial',
data = dat_all[dat_all$P_SM>0.8,]))
##
## Call:
## glm(formula = BS ~ log10_wbc, family = "binomial", data = dat_all[dat_all$P_SM >
## 0.8, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2379 -0.1912 -0.1789 -0.1654 2.9721
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1534 1.2629 -2.497 0.0125 *
## log10_wbc -0.9213 1.2119 -0.760 0.4471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 133.13 on 806 degrees of freedom
## Residual deviance: 132.53 on 805 degrees of freedom
## (630 observations deleted due to missingness)
## AIC: 136.53
##
## Number of Fisher Scoring iterations: 7
summary(glm(BS ~ log10_wbc, family='binomial',
data = dat_all[dat_all$P_SM<0.2,]))
##
## Call:
## glm(formula = BS ~ log10_wbc, family = "binomial", data = dat_all[dat_all$P_SM <
## 0.2, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4363 -0.3724 -0.3506 -0.3242 2.5366
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9199 0.9723 -1.975 0.0483 *
## log10_wbc -0.6903 0.8159 -0.846 0.3975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 173.34 on 377 degrees of freedom
## Residual deviance: 172.62 on 376 degrees of freedom
## (435 observations deleted due to missingness)
## AIC: 176.62
##
## Number of Fisher Scoring iterations: 5
ind = dat_all$sickle%in% 1:2
mod=glm(SM ~ sickle + study, family='binomial',
data.frame(SM=dat_all$SM[ind],
sickle=as.numeric(dat_all$sickle==2)[ind],
study = as.factor(dat_all$study)[ind]))
xx=summary(mod)$coefficients
xx
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4480304 0.09350842 -4.791338 1.656729e-06
## sickle -1.3855045 0.25720068 -5.386862 7.169854e-08
## studyKampala (Uganda) 1.4361129 0.13963506 10.284759 8.254826e-25
## studyKilifi (Kenya) 1.1824618 0.10908370 10.839950 2.225938e-27
round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2)
## [1] 0.15 0.25 0.41
for(ss in African_studies){
ind = dat_all$study==ss & dat_all$sickle%in% 1:2
mod=glm(SM ~ sickle, family='binomial',
data.frame(SM=dat_all$SM[ind],
sickle=as.numeric(dat_all$sickle==2)[ind]))
xx=summary(mod)$coefficients
print(ss)
print(round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2))
}
## [1] "Kilifi (Kenya)"
## [1] 0.13 0.25 0.48
## [1] "Kampala (Uganda)"
## [1] 0.05 0.37 2.68
## [1] "FEAST (Uganda)"
## [1] 0.10 0.23 0.56
ind = dat_all$sickle%in% c(1,3)
table(dat_all$SM, dat_all$sickle, dat_all$study)
## , , = Bangladesh
##
##
## 1 2 3
## 0 0 0 0
## 1 0 0 0
##
## , , = FEAST (Uganda)
##
##
## 1 2 3
## 0 284 40 20
## 1 182 6 1
##
## , , = Kampala (Uganda)
##
##
## 1 2 3
## 0 126 2 19
## 1 337 2 4
##
## , , = Kilifi (Kenya)
##
##
## 1 2 3
## 0 437 27 6
## 1 911 14 1
mod=glm(SM ~ sickle + study, family='binomial',
data.frame(SM=dat_all$SM[ind],
sickle=as.numeric(dat_all$sickle==3)[ind],
study = as.factor(dat_all$study)[ind]))
xx=summary(mod)$coefficients
xx
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4450513 0.09462094 -4.703518 2.557161e-06
## sickle -2.5409675 0.44759858 -5.676889 1.371664e-08
## studyKampala (Uganda) 1.4288183 0.14000062 10.205800 1.867760e-24
## studyKilifi (Kenya) 1.1797031 0.11100124 10.627837 2.211831e-26
round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2)
## [1] 0.03 0.08 0.19
for(ss in African_studies){
ind = dat_all$study==ss & dat_all$sickle%in% c(1,3)
mod=glm(SM ~ sickle, family='binomial',
data.frame(SM=dat_all$SM[ind],
sickle=as.numeric(dat_all$sickle==3)[ind]))
xx=summary(mod)$coefficients
print(ss)
print(round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2))
}
## [1] "Kilifi (Kenya)"
## [1] 0.01 0.08 0.67
## [1] "Kampala (Uganda)"
## [1] 0.03 0.08 0.24
## [1] "FEAST (Uganda)"
## [1] 0.01 0.08 0.59
# Additive model
mod=glm(SM ~ sickle + study, family='binomial',
data.frame(SM=dat_all$SM,
sickle=dat_all$sickle,
study = as.factor(dat_all$study)))
xx=summary(mod)$coefficients
xx
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8716288 0.2050853 4.250080 2.136943e-05
## sickle -1.3219401 0.1722037 -7.676607 1.633571e-14
## studyKampala (Uganda) 1.4410207 0.1378247 10.455464 1.383197e-25
## studyKilifi (Kenya) 1.1831195 0.1086905 10.885220 1.355712e-27
round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2)
## [1] 0.19 0.27 0.37
mclust
dat_all$log10_para = log10(dat_all$para+50)
dat_all$log10_wbc = log10(dat_all$wbc)
ind = !is.na(dat_all$log10_hrp2)&!is.na(dat_all$log10_platelet)
sum(ind)
## [1] 2622
table(dat_all$study[ind])
##
## Bangladesh FEAST (Uganda) Kampala (Uganda) Kilifi (Kenya)
## 171 559 492 1400
X = dat_all[ind, c('log10_hrp2','log10_platelet')]
mclust_mods = Mclust(data = X, G=3)
summary(mclust_mods)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3
## components:
##
## log-likelihood n df BIC ICL
## -4041.15 2622 17 -8216.118 -9016.347
##
## Clustering table:
## 1 2 3
## 865 1563 194
table(dat_all$study[ind],mclust_mods$classification)
##
## 1 2 3
## Bangladesh 27 144 0
## FEAST (Uganda) 173 195 191
## Kampala (Uganda) 171 321 0
## Kilifi (Kenya) 494 903 3
table(dat_all$sickle[ind],mclust_mods$classification)
##
## 1 2 3
## 1 752 1383 142
## 2 41 21 29
## 3 31 5 15
comp_cols = brewer.pal(n = mclust_mods$G, name = 'Dark2')
pairs(X, col = adjustcolor(comp_cols[mclust_mods$classification],.4),
pch=16)
model1_probs = read.csv(file = 'Kilifi_Model1_Probabilities.csv')
model1_probs = model1_probs %>% arrange(sample_code) %>% filter(sample_code %in% dat_all$IDs)
dat_Kilifi = dat_all %>% filter(study=='Kilifi (Kenya)')%>%arrange(IDs)
par(las=1, family='serif', cex.lab=1.3, cex.axis=1.3)
plot(model1_probs$P_SM1, dat_Kilifi$P_SM,
xlab='Prob(Severe Malaria): platelet/white count model',
ylab='Prob(Severe Malaria): platelet/PfHRP2 model',
panel.first=grid(), main = 'Kilifi (Kenya)')
abline(lm(dat_Kilifi$P_SM~model1_probs$P_SM1), col='red',lwd=3)
hist(model1_probs$P_SM1 - dat_Kilifi$P_SM)
cor.test(model1_probs$P_SM1, dat_Kilifi$P_SM)
##
## Pearson's product-moment correlation
##
## data: model1_probs$P_SM1 and dat_Kilifi$P_SM
## t = 36.21, df = 1398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6676275 0.7217688
## sample estimates:
## cor
## 0.6956848
Save output
# dat_kemri = dat_all[dat_all$study=='Kilifi (Kenya)', ]
# save(dat_kemri, file = 'kemri_PSM.RData')
We use the model of platelet counts and HRP2, not including the FEAST study (this is more conservative in its estimated operating characteristics)
source('functions.R')
x1=seq(100,200,by=1)
x2=seq(500, 1500, by=1)
my_thresh=expand.grid(plt_thresh = x1,
hrp2_thresh = x2)
my_thresh$sens = my_thresh$sp = NA
for(k in 1:nrow(my_thresh)){
out=roc_output(thetas = thetas_plt_hrp2,
stan_dat = stan_dat_plt_hrp2,
cutofftype = c('upper','lower'),
mycuts = log10(unlist(my_thresh[k,1:2])))
my_thresh$sens[k] = out[1]
my_thresh$sp[k] = out[2]
}
my_thresh$FP = round(100*(1-my_thresh$sp), 2)
my_thresh$TP = round(100*my_thresh$sens, 2)
xs = unique(my_thresh$FP)
ys = array(dim = length(xs))
for(i in 1:length(xs)){
ind = my_thresh$FP == xs[i]
ys[i] = max(my_thresh$TP[ind])
}
my_fp = 5.2
my_tp = 79
Plot results
f=smooth.spline(x = xs, y = ys,df = 5)
par(las=1, family='serif', cex.lab=1.3, cex.axis=1.3,mfrow=c(2,2))
z_FP = dcast(data = my_thresh, formula = hrp2_thresh~plt_thresh,
value.var='FP')
range(z_FP[, -1])
## [1] 1.1 12.0
image(y = x1, x=x2, z=as.matrix(z_FP[, -1]),
breaks = seq(0,15,length.out = 16),
main='False positive rate',yaxt='n',
col=hcl.colors(15, "YlGn", rev = TRUE),
xlab='PfHRP2 (ng/mL)',ylab='Platelet count (x1000 per uL)')
axis(2, at = c(100,150,200))
grid()
abline(v=800, h=150,lty=2)
z_TP = dcast(data = my_thresh, formula = hrp2_thresh~plt_thresh, value.var='TP')
range(z_TP[, -1])
## [1] 54.5 90.3
image(y = x1, x=x2, z=as.matrix(z_TP[, -1]),
breaks = seq(50,91,length.out = 16),panel.first=grid(),
main='True positive rate',yaxt='n',
col=hcl.colors(15, "Blue-Red 3", rev = TRUE),
xlab='PfHRP2 (ng/mL)',ylab='Platelet count (x1000 per uL)')
axis(2, at = c(100,150,200))
grid()
abline(v=800, h=150,lty=2)
plot(f$x, f$y, panel.first=grid(),
xlab = 'False positive rate (%)',
ylab = 'True positive rate (%)',
xlim = c(0,15), ylim = c(50, 100), type='l',lwd=2)
abline(v=5, h=80,lty=2)
ind = my_thresh$FP < my_fp & my_thresh$TP > my_tp
writeLines(sprintf('There are %s combinations that have a true positive rate greater than %s and a false positive rate less than %s',
sum(ind), my_tp, my_fp))
## There are 62 combinations that have a true positive rate greater than 79 and a false positive rate less than 5.2
print(my_thresh[ind, ])
## plt_thresh hrp2_thresh sp sens FP TP
## 22871 144 726 0.9485039 0.7905506 5.1 79.1
## 24185 145 739 0.9485109 0.7908446 5.1 79.1
## 24286 145 740 0.9485663 0.7906988 5.1 79.1
## 24387 145 741 0.9486216 0.7905530 5.1 79.1
## 25499 146 752 0.9485189 0.7910760 5.1 79.1
## 25600 146 753 0.9485737 0.7909287 5.1 79.1
## 25701 146 754 0.9486283 0.7907813 5.1 79.1
## 25802 146 755 0.9486829 0.7906338 5.1 79.1
## 26813 147 765 0.9485279 0.7912465 5.1 79.1
## 26914 147 766 0.9485820 0.7910977 5.1 79.1
## 27015 147 767 0.9486361 0.7909487 5.1 79.1
## 27116 147 768 0.9486900 0.7907997 5.1 79.1
## 27217 147 769 0.9487438 0.7906506 5.1 79.1
## 27318 147 770 0.9487976 0.7905014 5.1 79.1
## 28127 148 778 0.9485379 0.7913574 5.1 79.1
## 28228 148 779 0.9485915 0.7912071 5.1 79.1
## 28329 148 780 0.9486449 0.7910567 5.1 79.1
## 28430 148 781 0.9486982 0.7909063 5.1 79.1
## 28531 148 782 0.9487514 0.7907557 5.1 79.1
## 28632 148 783 0.9488046 0.7906051 5.1 79.1
## 29441 149 791 0.9485491 0.7914102 5.1 79.1
## 29542 149 792 0.9486020 0.7912585 5.1 79.1
## 29643 149 793 0.9486549 0.7911067 5.1 79.1
## 29744 149 794 0.9487076 0.7909549 5.1 79.1
## 29845 149 795 0.9487602 0.7908029 5.1 79.1
## 29946 149 796 0.9488128 0.7906509 5.1 79.1
## 30654 150 803 0.9485090 0.7915594 5.1 79.2
## 30755 150 804 0.9485614 0.7914064 5.1 79.1
## 30856 150 805 0.9486137 0.7912533 5.1 79.1
## 30957 150 806 0.9486660 0.7911002 5.1 79.1
## 31058 150 807 0.9487182 0.7909470 5.1 79.1
## 31159 150 808 0.9487702 0.7907937 5.1 79.1
## 31260 150 809 0.9488222 0.7906403 5.1 79.1
## 31968 151 816 0.9485230 0.7915016 5.1 79.2
## 32069 151 817 0.9485749 0.7913473 5.1 79.1
## 32170 151 818 0.9486267 0.7911929 5.1 79.1
## 32271 151 819 0.9486784 0.7910384 5.1 79.1
## 32372 151 820 0.9487299 0.7908839 5.1 79.1
## 32473 151 821 0.9487814 0.7907293 5.1 79.1
## 32574 151 822 0.9488328 0.7905747 5.1 79.1
## 33282 152 829 0.9485383 0.7913898 5.1 79.1
## 33383 152 830 0.9485896 0.7912342 5.1 79.1
## 33484 152 831 0.9486408 0.7910786 5.1 79.1
## 33585 152 832 0.9486920 0.7909229 5.1 79.1
## 33686 152 833 0.9487430 0.7907671 5.1 79.1
## 33787 152 834 0.9487939 0.7906112 5.1 79.1
## 34495 153 841 0.9485040 0.7913821 5.1 79.1
## 34596 153 842 0.9485549 0.7912254 5.1 79.1
## 34697 153 843 0.9486056 0.7910686 5.1 79.1
## 34798 153 844 0.9486563 0.7909117 5.1 79.1
## 34899 153 845 0.9487069 0.7907548 5.1 79.1
## 35000 153 846 0.9487573 0.7905978 5.1 79.1
## 35809 154 854 0.9485223 0.7911675 5.1 79.1
## 35910 154 855 0.9485727 0.7910096 5.1 79.1
## 36011 154 856 0.9486229 0.7908517 5.1 79.1
## 36112 154 857 0.9486730 0.7906937 5.1 79.1
## 36213 154 858 0.9487230 0.7905356 5.1 79.1
## 37123 155 867 0.9485420 0.7909029 5.1 79.1
## 37224 155 868 0.9485917 0.7907439 5.1 79.1
## 37325 155 869 0.9486414 0.7905848 5.1 79.1
## 38336 156 879 0.9485135 0.7907496 5.1 79.1
## 38437 156 880 0.9485629 0.7905895 5.1 79.1
## Legend plot
plot(NA,NA,xlim=0:1,ylim=0:1,yaxt='n',xaxt='n',ylab='',xlab='',bty='n')
lgd_ = rep(NA, 15)
lgd_[c(1,8,15)] = c(0,8,15)
legend('left',title = 'False positive rate\n',
legend = lgd_,
fill = hcl.colors(15, "YlGn", rev = TRUE),
bty='n',x.intersp =.5,
border = NA,inset = 0.01,
y.intersp = 0.5,
cex = 1, text.font = 1)
lgd_ = rep(NA, 15)
lgd_[c(1,8,15)] = c(90,70,50)
legend('right',title = 'True positive rate\n',
legend = lgd_,
fill = hcl.colors(15, "Blue-Red 3", rev = F),
bty='n',x.intersp =.5,
border = NA,inset = 0.01,
y.intersp = 0.5,
cex = 1, text.font = 1)